1 package org.apache.lucene.util;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87 public class TestUnicodeUtil extends LuceneTestCase {
88 public void testCodePointCount() {
89
90 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0x80, 'z', 'z', 'z'));
91 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xc0 - 1, 'z', 'z', 'z'));
92
93 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf8, 'z', 'z', 'z'));
94 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xfc, 'z', 'z', 'z'));
95
96 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xc2));
97 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xe2));
98 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xe2, 0x82));
99 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0));
100 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0, 0xa4));
101 assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0, 0xa4, 0xad));
102
103
104 assertEquals(0, UnicodeUtil.codePointCount(new BytesRef(asByteArray())));
105 assertEquals(3, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 'z', 'z'))));
106 assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xc2, 0xa2))));
107 assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xe2, 0x82, 0xac))));
108 assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xf0, 0xa4, 0xad, 0xa2))));
109
110
111 int num = atLeast(50000);
112 for (int i = 0; i < num; i++) {
113 final String s = TestUtil.randomUnicodeString(random());
114 final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
115 final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
116 assertEquals(s.codePointCount(0, s.length()),
117 UnicodeUtil.codePointCount(new BytesRef(utf8, 0, utf8Len)));
118 }
119 }
120
121 private byte[] asByteArray(int... ints) {
122 byte [] asByteArray = new byte [ints.length];
123 for (int i = 0; i < ints.length; i++) {
124 asByteArray[i] = (byte) ints[i];
125 }
126 return asByteArray;
127 }
128
129 private void assertcodePointCountThrowsAssertionOn(byte... bytes) {
130 boolean threwAssertion = false;
131 try {
132 UnicodeUtil.codePointCount(new BytesRef(bytes));
133 } catch (IllegalArgumentException e) {
134 threwAssertion = true;
135 }
136 assertTrue(threwAssertion);
137 }
138
139 public void testUTF8toUTF32() {
140 int[] utf32 = new int[0];
141 int[] codePoints = new int[20];
142 int num = atLeast(50000);
143 for (int i = 0; i < num; i++) {
144 final String s = TestUtil.randomUnicodeString(random());
145 final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
146 final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
147 utf32 = ArrayUtil.grow(utf32, utf8Len);
148 final int utf32Len = UnicodeUtil.UTF8toUTF32(new BytesRef(utf8, 0, utf8Len), utf32);
149
150 int charUpto = 0;
151 int intUpto = 0;
152 while(charUpto < s.length()) {
153 final int cp = s.codePointAt(charUpto);
154 codePoints[intUpto++] = cp;
155 charUpto += Character.charCount(cp);
156 }
157 if (!ArrayUtil.equals(codePoints, 0, utf32, 0, intUpto)) {
158 System.out.println("FAILED");
159 for(int j=0;j<s.length();j++) {
160 System.out.println(" char[" + j + "]=" + Integer.toHexString(s.charAt(j)));
161 }
162 System.out.println();
163 assertEquals(intUpto, utf32Len);
164 for(int j=0;j<intUpto;j++) {
165 System.out.println(" " + Integer.toHexString(utf32[j]) + " vs " + Integer.toHexString(codePoints[j]));
166 }
167 fail("mismatch");
168 }
169 }
170 }
171
172 public void testNewString() {
173 final int[] codePoints = {
174 Character.toCodePoint(Character.MIN_HIGH_SURROGATE,
175 Character.MAX_LOW_SURROGATE),
176 Character.toCodePoint(Character.MAX_HIGH_SURROGATE,
177 Character.MIN_LOW_SURROGATE), Character.MAX_HIGH_SURROGATE, 'A',
178 -1,};
179
180 final String cpString = "" + Character.MIN_HIGH_SURROGATE
181 + Character.MAX_LOW_SURROGATE + Character.MAX_HIGH_SURROGATE
182 + Character.MIN_LOW_SURROGATE + Character.MAX_HIGH_SURROGATE + 'A';
183
184 final int[][] tests = { {0, 1, 0, 2}, {0, 2, 0, 4}, {1, 1, 2, 2},
185 {1, 2, 2, 3}, {1, 3, 2, 4}, {2, 2, 4, 2}, {2, 3, 0, -1}, {4, 5, 0, -1},
186 {3, -1, 0, -1}};
187
188 for (int i = 0; i < tests.length; ++i) {
189 int[] t = tests[i];
190 int s = t[0];
191 int c = t[1];
192 int rs = t[2];
193 int rc = t[3];
194
195 try {
196 String str = UnicodeUtil.newString(codePoints, s, c);
197 assertFalse(rc == -1);
198 assertEquals(cpString.substring(rs, rs + rc), str);
199 continue;
200 } catch (IndexOutOfBoundsException | IllegalArgumentException e1) {
201
202 }
203 assertTrue(rc == -1);
204 }
205 }
206
207 public void testUTF8UTF16CharsRef() {
208 int num = atLeast(3989);
209 for (int i = 0; i < num; i++) {
210 String unicode = TestUtil.randomRealisticUnicodeString(random());
211 BytesRef ref = new BytesRef(unicode);
212 CharsRefBuilder cRef = new CharsRefBuilder();
213 cRef.copyUTF8Bytes(ref);
214 assertEquals(cRef.toString(), unicode);
215 }
216 }
217
218 public void testCalcUTF16toUTF8Length() {
219 int num = atLeast(5000);
220 for (int i = 0; i < num; i++) {
221 String unicode = TestUtil.randomUnicodeString(random());
222 byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
223 int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
224 assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
225 }
226 }
227 }